# The purpose of this script is to conduct data analysis establishing who
# talks about Critical Race Theory (CRT).

# It uses three datasets: DCInbox, Google Trends,
# and the Stanford Cable TV News Analyzer.

####### LIBRARIES ####
library(here)
library(dplyr)
library(ggplot2)
library(stringr)
library(stringi)
library(dataverse)
library(tidyverse)
library(janitor)
library(anytime)
library(gridExtra)

###### LOAD DATA AND BASIC CLEANING ####

# DCINBOX #
# Raw DCInbox export, located relative to the project root via here().
# Downloaded May 3, 2023. This dataset updates monthly.
# It is large.
dcinbox <- here("dcinbox_export.csv") %>%
  read.csv()

# GOOGLE TRENDS #
# Read the Google Trends export and tidy it:
#   * the row names are moved into a regular column and the first parsed row
#     is promoted to column names (row_to_names), which yields the `Month`
#     column and the search-term column renamed to `crt_us` below;
#   * "<1" entries are recoded to "0" so `crt_us` can be converted to numeric;
#   * `Month_temp` holds `Month` parsed as a Date by anydate().
trends <- read.csv(here("multiTimeline.csv"),
                   stringsAsFactors = FALSE,
                   strip.white = TRUE) %>%
  rownames_to_column() %>%
  row_to_names(1) %>%
  rename(crt_us = "critical race theory: (United States)") %>%
  mutate(
    crt_us = case_when(
      crt_us == "<1" ~ "0",
      TRUE ~ crt_us
    ),
    crt_us = as.numeric(crt_us),
    # refer to the column through the data mask instead of `.$Month`
    Month_temp = anydate(Month)
  )

# STANFORD CABLE TV NEWS #
# Each row carries the query that produced it. The three known queries are
# mapped to a channel label (any other query yields NA), then the plotting
# columns are derived: `date` parsed from `Time`, and `minutes` as Value / 60
# (presumably Value is in seconds -- confirm against the export).
tvnews <- here("tvnews_20230504022350.csv") %>%
  read.csv() %>%
  mutate(
    channel = case_when(
      Query == "channel=\"FOX\" AND text=\"critical race theory|Critical Race Theory|CRT\"" ~ "FOX",
      Query == "channel=\"CNN\" AND text=\"critical race theory|Critical Race Theory|CRT\"" ~ "CNN",
      Query == "channel=\"MSNBC\" AND text=\"critical race theory|Critical Race Theory|CRT\"" ~ "MSNBC"
    )
  ) %>%
  mutate(
    # fixed level order controls legend / colour order in the plots
    channel = factor(channel, levels = c("FOX", "CNN", "MSNBC")),
    date = as.Date(Time, format = "%Y-%m-%d"),
    minutes = Value / 60
  )

##### DCINBOX: MORE DATA CLEANING ####
# Derive date fields from `Unix.Timestamp`.
# The timestamp is divided by 1000 before being read as seconds since the
# epoch (presumably it is stored in milliseconds -- confirm against the export).
dcinbox_clean <- dcinbox %>%
  mutate(
    clean_date_time = as.POSIXct(Unix.Timestamp / 1000, origin = "1970-01-01"),
    clean_date = as.Date(clean_date_time),
    # year ("YYYY") and month-year ("MM-YYYY") as character labels
    clean_year = format(clean_date, "%Y"),
    clean_month_year = format(clean_date, "%m-%Y")
  )

# Keep only the newsletters whose Body matches the CRT search pattern.
# Matching is case-insensitive and is not anchored to word boundaries.
# NOTE(review): "CRT" therefore also matches as a bare substring (any case)
# inside longer tokens -- confirm this does not add false positives.
# This takes a bit to run.
dcinbox_CRT <- filter(
  dcinbox_clean,
  grepl("Critical Race Theory|CRT|critical race theory|CRITICAL RACE THEORY",
        Body,
        ignore.case = TRUE)
)

# Creating different plotting data frames
# CRT newsletter counts per party and Congress.
dcinbox_plot <- dcinbox_CRT %>%
  group_by(Party, Congress) %>%
  summarise(n = n(), .groups = "drop") %>%
  # explicit zero-count row for Democrat / Congress 118
  add_row(Party = "Democrat", Congress = 118, n = 0) %>%
  # Party values other than these two levels become NA
  mutate(Party = factor(Party, levels = c("Republican", "Democrat")))

# CRT newsletter counts per party and calendar year.
dcinbox_plot_year <- dcinbox_CRT %>%
  group_by(Party, clean_year) %>%
  summarise(n = n(), .groups = "drop") %>%
  # explicit zero-count row for Democrat / 2023
  add_row(Party = "Democrat", clean_year = "2023", n = 0) %>%
  # Party values other than these two levels become NA
  mutate(Party = factor(Party, levels = c("Republican", "Democrat")))

# CRT newsletter counts per party and month-year ("MM-YYYY" label).
dcinbox_plot_monthyear <- dcinbox_CRT %>%
  group_by(Party, clean_month_year) %>%
  summarise(n = n(), .groups = "drop") %>%
  # Party values other than these two levels become NA
  mutate(Party = factor(Party, levels = c("Republican", "Democrat")))

# Total number of newsletters (CRT or not) per party and calendar year; used
# as the denominator for the share of newsletters that mention CRT.
# `.groups = "drop"` returns an ungrouped result, so the Party grouping no
# longer leaks into later joins and mutates.
dcinbox_total <- dcinbox_clean %>%
  group_by(Party, clean_year) %>%
  summarise(total_n = n(), .groups = "drop") %>%
  # Party values other than these two levels become NA and are dropped
  mutate(Party = factor(Party, levels = c("Republican", "Democrat"))) %>%
  filter(!is.na(Party))

# Attach the yearly CRT counts to the yearly totals and compute the share.
# Join keys are spelled out so an extra shared column cannot silently change
# the join; the result is explicitly ungrouped.
dc_merge <- dcinbox_total %>%
  left_join(dcinbox_plot_year, by = c("Party", "clean_year")) %>%
  ungroup() %>%
  mutate(
    # party-years with no CRT newsletters have no match in the join -> count 0
    n = case_when(
      is.na(n) ~ 0,
      TRUE ~ n
    ),
    # percentage of all newsletters in that party-year that mention CRT
    prop_crt = (n / total_n) * 100
  )

##### INTERMEDIATE PLOTS AND PREP ####

# By party over time (year): one dodged bar per party within each year
newsletter_plot <- dcinbox_plot_year %>%
  ggplot(aes(x = factor(clean_year), y = n, fill = Party)) +
  geom_col(position = "dodge") +
  theme_bw() +
  theme(
    legend.title = element_blank(),
    plot.caption = element_text(face = "italic"),
    panel.grid.minor = element_blank()
  ) +
  labs(
    title = "Number of Congressional newsletters that mention CRT: by year and party",
    x = "Year",
    y = "Count",
    caption = "Data sourced from DCInbox (pulled May 2023).\nNewsletters included contain the term 'CRT'\n or 'Critical Race Theory'(not case sensitive)."
  )


# Cable news coverage over time: one line per channel, with a dashed
# reference line at 2021-06-01.
tvnews_plot <- tvnews %>%
  ggplot(aes(x = date, y = minutes, group = channel, color = channel)) +
  geom_line() +
  geom_vline(
    xintercept = as.Date("2021-06-01"),
    linetype = "dashed",
    color = "deep pink"
  ) +
  scale_x_date(date_labels = "%b %Y", date_breaks = "4 month") +
  scale_y_continuous(breaks = seq(0, 25, by = 5)) +
  theme_minimal() +
  theme(
    legend.title = element_blank(),
    plot.caption = element_text(face = "italic"),
    panel.grid.minor = element_blank()
  ) +
  labs(
    title = "Minutes per month devoted to Critical Race Theory by TV channel",
    x = "Date",
    y = "Minutes per month",
    caption = "Data from Stanford Cable TV News Analyzer. Keyword terms were\n 'Critical Race Theory' and 'CRT', not case sensitive. Minutes per month\n are calculated via text querying captions."
  )

##### FINAL PLOTS ####
# Figure 1 (Newsletter and TV Channel CRT plot)
# Both plots are drawn side by side in one frame (newsletters left, TV right),
# with shortened titles, legends on top and slanted x-axis labels.
grid.arrange(
  newsletter_plot +
    labs(title = "Number of Congressional\nnewsletters that mention CRT") +
    theme(
      legend.position = "top",
      axis.text.x = element_text(angle = 45, hjust = 0.75),
      legend.key.size = unit(0.3, "cm")
    ),
  tvnews_plot +
    labs(title = "Minutes per month devoted to CRT\nby channel") +
    theme(
      legend.position = "top",
      axis.text.x = element_text(angle = 45, hjust = 0.75)
    ),
  ncol = 2,
  widths = c(0.45, 0.55)
)

# Figure 2 (Google Trends, zoomed in around peak --- June 2021)
# Shows the months from 2020-01-01 up to (not including) 2022-01-01; the
# lower bound is inclusive so the window, and the title, cover 2020-2021.
# Event labels are drawn with annotate() rather than geom_text(aes(...)):
# a geom_text() layer inherits the plot data and redraws the same label once
# per data row, which overplots the text.
trends %>%
  filter(Month_temp >= as.Date("2020-01-01"),
         Month_temp < as.Date("2022-01-01")) %>%
  ggplot(aes(x = Month_temp, y = crt_us)) +
  geom_col(fill = "lightblue") +
  theme_minimal() +
  xlab("Year") +
  ylab("Relative popularity") +
  ggtitle("Google trends: searches for 'critical race theory' (2020-2021)") +
  scale_x_date(date_labels = "%b %Y",
               date_breaks = "3 months") +
  # Murder of George Floyd
  geom_vline(xintercept = as.Date("2020-05-25"),
             color = "red",
             linetype = "dashed") +
  # We see Republican legislators start to talk about CRT in their
  # newsletters on this date with much more frequency. The peak is June 2021.
  geom_vline(xintercept = as.Date("2021-04-30"),
             color = "darkred",
             linetype = "dashed") +
  # First major uptick in Fox news coverage of CRT in the time period.
  geom_vline(xintercept = as.Date("2021-06-01"),
             color = "deeppink2",
             linetype = "dashed") +
  # Text labels are placed slightly left of their reference lines so they
  # are readable.
  annotate("text",
           x = as.Date("2020-05-01"), y = 50,
           label = "\n George Floyd is murdered",
           colour = "red",
           size = 3, angle = 90) +
  annotate("text",
           x = as.Date("2021-04-10"), y = 50,
           label = "\n Rise in Rep. newsletters w/ CRT mentions",
           colour = "firebrick",
           size = 2.9, angle = 90) +
  annotate("text",
           x = as.Date("2021-05-14"), y = 50,
           label = "\n First peak in Fox News CRT mentions",
           colour = "deeppink2",
           size = 3, angle = 90) +
  theme(legend.title = element_blank(),
        plot.caption = element_text(face = "italic"),
        panel.grid.minor = element_blank()) +
  labs(caption = "Bar graph data sourced from Google Trends. The y-axis is relative popularity.\n A 100 on the y-axis reflects when the term was at its peak popularity, other\n values are calculated relative to that. Google trends data granularity is month-year.")

# Figure A1 (Newsletters that mention CRT as a function of all letters)
# Same layout as the yearly count plot, but the bar height is the percentage
# of each party-year's newsletters that mention CRT.
dc_merge %>%
  ggplot(aes(x = factor(clean_year), y = prop_crt, fill = Party)) +
  geom_col(position = "dodge") +
  theme_bw() +
  theme(
    legend.title = element_blank(),
    plot.caption = element_text(face = "italic"),
    panel.grid.minor = element_blank()
  ) +
  labs(
    title = "% of Congressional newsletters that mention CRT: by year and party",
    x = "Year",
    y = "% of all newsletters",
    caption = "Data sourced from DCInbox (pulled May 2023).\nNewsletters included contain the term 'CRT'\n or 'Critical Race Theory'(not case sensitive)."
  )

# Figure A2
# Full Google Trends series with the 2021-06-01 bar highlighted.
# The bar colour is computed per row inside aes() and used as-is through
# scale_fill_identity(). This keeps the colours tied to the plotted rows and
# passes plain colour names instead of a factor built outside the mapping.
ggplot(trends, aes(x = Month_temp, y = crt_us)) +
  geom_col(aes(fill = ifelse(Month_temp == as.Date("2021-06-01"),
                             "orchid1",
                             "lightblue"))) +
  scale_fill_identity() +
  theme_minimal() +
  xlab("Year") +
  ylab("Relative popularity") +
  ggtitle("Google trends: searches for 'critical race theory'") +
  scale_x_date(date_labels = "%Y",
               date_breaks = "2 years",
               limits = c(as.Date("2004-01-01"), as.Date("2023-06-01"))) +
  theme(legend.title = element_blank(),
        plot.caption = element_text(face = "italic"),
        panel.grid.minor = element_blank()) +
  labs(caption = "Sourced from Google Trends. The y-axis is relative popularity.\n A 100 on the y axis reflects when the term was at its peak popularity,\n other values are calculated relative to that. Data granularity is month-year.")




